#setwd('/afs/inf.ed.ac.uk/user/s17/s1725186/Documents/PhD-Models/FirstPUModel/RMarkdowns')
library(tidyverse) ; library(reshape2) ; library(glue) ; library(plotly) ; library(dendextend)
library(RColorBrewer) ; library(viridis) ; require(gridExtra) ; library(GGally) ; library(gplots)
library(WGCNA)
library(expss)
library(polycor)
library(knitr)
SFARI_colour_hue = function(r) {
pal = c('#FF7631','#FFB100','#E8E328','#8CC83F','#62CCA6','#59B9C9','#b3b3b3','#808080','gray','#d9d9d9')[r]
}
clustering_selected = 'DynamicHybridMergedSmall'
#clustering_selected = 'DynamicHybrid'
cat(paste0('Using clustering ', clustering_selected))
## Using clustering DynamicHybridMergedSmall
Load preprocessed dataset (preprocessing code in 19_10_14_data_preprocessing.Rmd) and clustering (pipeline in 19_10_15_WGCNA.Rmd)
# Gandal dataset
load('./../Data/Gandal/preprocessed_data.RData')
datExpr = datExpr %>% data.frame
DE_info = DE_info %>% data.frame
# GO Neuronal annotations
GO_annotations = read.csv('./../../../PhD-InitialExperiments/FirstYearReview/Data/GO_annotations/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID'=as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neuronal'=1)
# SFARI Genes
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_08-29-2019_with_ensembl_IDs.csv')
SFARI_genes = SFARI_genes[!duplicated(SFARI_genes$ID) & !is.na(SFARI_genes$ID),]
# Update DE_info with SFARI and Neuronal information
genes_info = DE_info %>% mutate('ID'=rownames(.)) %>% left_join(SFARI_genes, by='ID') %>%
mutate(`gene-score`=ifelse(is.na(`gene-score`), 'None', `gene-score`)) %>%
left_join(GO_neuronal, by='ID') %>%
mutate(Neuronal=ifelse(is.na(Neuronal), 0, Neuronal)) %>%
mutate(gene.score=ifelse(`gene-score`=='None' & Neuronal==1, 'Neuronal', `gene-score`),
significant=padj<0.05 & !is.na(padj))
# Dataset created with DynamicTreeMerged algorithm
dataset = read.csv(paste0('./../Data/Gandal/dataset_', clustering_selected, '.csv'))
dataset$Module = dataset[,clustering_selected]
# GO DEA
# load('./../Data/Gandal/GO_DE_clusters.RData')
rm(DE_info, GO_annotations, clusterings)
Check if for each gene, the module with the smallest distance is their corresponding module
mm = dataset %>% dplyr::select(starts_with('MM.'), starts_with('MMgray')) %>% dplyr::rename(MM.gray = MMgray)
rownames(mm) = dataset$ID
original_max_membership = gsub('MM.', '', colnames(mm)[max.col(mm, ties.method='first')])
cat(paste0('For ', round(100*mean(original_max_membership == gsub('#','',dataset$Module))),
'% of the genes, their assigned module corresponds to the module with the highest Module Membership'))
## For 53% of the genes, their assigned module corresponds to the module with the highest Module Membership
Apparently this is not the case. Someone asked this in a Bioconductor question and Peter Langfelder answered that this is because WGCNA assigns module labels using dynamic tree cut of hierarchical clustering tree that is based on the Toplogical Overlap Measure. TOM results in similar but not quite the same similarity as correlation, hence for some genes the assigned module may differ from the module with highest kME. But that in all, he doesn’t worry about the module assignment vs. max. kME differences in his own analyses, and he recommends not worrying about it to others as well.
Before following his advice and stop worrying about it I’m going to try to understand better when these two modules don’t match
The palette on the top of the heatmap represents the size of the module, the darker the colour, the larger the module
module_size = dataset %>% group_by(Module) %>% tally %>% mutate(Module = gsub('#', '', Module))
module_size$quant = cut(module_size$n, breaks=9, labels=FALSE)
module_size$meanExpr = sapply(module_size$Module, function(m){mean(rowMeans(datExpr)[dataset$Module==m | dataset$Module==paste0('#',m)])})
module_size$quantME = cut(module_size$meanExpr, breaks=9, labels=FALSE)
heatmap.2(table(original_max_membership, gsub('#','',dataset$Module)), symm=TRUE, dendrogram='none', keysize=1,
trace='none', scale='none', col=brewer.pal(9,'YlGnBu'), xlab='Assigned Module', ylab='Highest MM',
ColSideColors = brewer.pal(9,'PuBu')[module_size$quant])
Scaling by rows it’s easier to see that genes assigned to the larger modules sometimes have a higher module membership with smaller modules
heatmap.2(table(original_max_membership, gsub('#','',dataset$Module)), dendrogram='none', keysize=1, symm=TRUE,
trace='none', scale='row', col=brewer.pal(9,'YlGnBu'), xlab='Assigned Module', ylab='Highest MM',
ColSideColors=brewer.pal(9,'PuBu')[module_size$quant])
Module Membership by gene (selecting a random sample so it’s not that heavy)
The membership of some module seems to be related to the level of expression of the genes
set.seed(123)
plot_mm = mm %>% sample_frac(0.05) %>% as.matrix
colnames(plot_mm) = gsub('MM.','', colnames(plot_mm))
plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$n)]]
heatmap.2(plot_mm, xlab('Modules ordered by Size'), ylab('Genes ordered by mean expression'), keysize=1,
ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]],
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'Spectral'))
Letting the heatmap order the genes and modules by distance, it seems like the module size is not an important factor but the mean expression is. The modules in the right leg of the dendrogram seem to be associated to genes with low expression and the ones on the left leg to genes with high expression
heatmap.2(plot_mm, trace='none', col=brewer.pal(9,'Spectral'), dendrogram='column',
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]])
The biggest modules have the most extreme memberships (both positive and negative)
plot_mm = plot_mm[order(rowMeans(abs(plot_mm))), module_size$Module[order(module_size$n)]]
heatmap.2(abs(plot_mm), ColSideColors = brewer.pal(9,'PuBu')[module_size$quant[order(module_size$n)]], keysize=1,
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'YlOrRd'))
Checking if the mean expression of the modules plays a role
plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1,
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
trace='none', dendrogram='none', scale='none', Rowv=FALSE, Colv=FALSE, col=brewer.pal(9,'Spectral'))
plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1,
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
trace='none', dendrogram='none', scale='none', Rowv=TRUE, Colv=FALSE, col=brewer.pal(9,'Spectral'))
The mean expression pattern seems to be clearer in the samples than in the modules
The colors in the histogram seem to have inverted, with the red being the highest MM and blue de lowest (I have no idea how this happened)
plot_mm = plot_mm[order(rowMeans(plot_mm)), module_size$Module[order(module_size$meanExpr)]]
heatmap.2(plot_mm, ColSideColors = brewer.pal(9,'RdPu')[module_size$quantME[order(module_size$meanExpr)]], keysize=1,
RowSideColors = brewer.pal(9, 'RdPu')[cut(rowMeans(plot_mm), breaks=9, labels=FALSE)],
trace='none', dendrogram='column', scale='none', col=brewer.pal(9,'Spectral'))
Because of the weird inversion in the heatmap palette from above, check if there’s a relation between the gene’s level of expression and the average level of expression of the module it is assigned to: there is and it is positive, so something weird is happening in the heatmaps above
I’m not sure how to compare these two plots …
plot_data = data.frame('ID' = dataset$ID, 'GeneMeanExpr'=rowMeans(datExpr), 'Module' = gsub('#','',dataset$Module)) %>%
left_join(module_size, by='Module') %>% mutate('ModuleMeanExpr' = meanExpr)
MM_module_size = data.frame('Module' = original_max_membership) %>% group_by(Module) %>% tally()
MM_module_size$meanExpr = sapply(MM_module_size$Module, function(m){mean(rowMeans(datExpr)[original_max_membership==m |
original_max_membership==paste0('#',m)])})
MM_plot_data = data.frame('ID' = dataset$ID, 'GeneMeanExpr'=rowMeans(datExpr), 'Module' = gsub('#','',dataset$Module)) %>%
left_join(MM_module_size, by='Module') %>% mutate('ModuleMeanExpr' = meanExpr)
p1 = plot_data %>% ggplot(aes(ModuleMeanExpr, GeneMeanExpr)) + geom_point(alpha=0.2, color=gsub('#gray','gray',paste0('#',plot_data$Module))) +
geom_smooth(color='gray') + theme_minimal() + xlab('Assigned Module Mean Expression') + ylab('Gene Mean Expression') +
ggtitle(paste0('Cor = ', round(cor(plot_data$ModuleMeanExpr, plot_data$GeneMeanExpr),3),
' Regression slope = ', round(lm(GeneMeanExpr ~ ModuleMeanExpr, data=plot_data)$coefficients[[2]],2),
' R^2 = ', round(cor(plot_data$ModuleMeanExpr, plot_data$GeneMeanExpr)^2,3)))
p2 = MM_plot_data %>% ggplot(aes(ModuleMeanExpr, GeneMeanExpr)) + geom_point(alpha=0.2, color=gsub('#gray','gray',paste0('#',plot_data$Module))) +
geom_smooth(color='gray') + theme_minimal() + xlab('Highest MM Module Mean Expression') + ylab('Gene Mean Expression') +
ggtitle(paste0('Cor = ', round(cor(MM_plot_data$ModuleMeanExpr, MM_plot_data$GeneMeanExpr),3),
' Regression slope = ', round(lm(GeneMeanExpr ~ ModuleMeanExpr, data=MM_plot_data)$coefficients[[2]],2),
' R^2 = ', round(cor(MM_plot_data$ModuleMeanExpr, MM_plot_data$GeneMeanExpr)^2,3)))
grid.arrange(p1, p2, nrow = 1)
rm(p1, p2)
Assigning genes by MM increases the mean expression of the largest modules and decreases the mean expression of the smallest ones
plot_data = module_size %>% dplyr::select(Module, meanExpr, n) %>% rename(AssignedModuleME=meanExpr) %>%
left_join(MM_module_size, by='Module') %>% rename(MMModuleME=meanExpr)
ggplotly(plot_data %>% ggplot(aes(AssignedModuleME, MMModuleME, size=n.x)) + geom_abline(slope=1, intercept=0, color='gray') +
geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module)), alpha=0.5, aes(id=Module)) + ggtitle('Mean Expression') +
xlab('Mean Expression of Assigned Module') + ylab('Mean Expression of Module with Higest MM') + theme_minimal() + coord_fixed())
There doesn’t seem to be a relation between module size and level of expression, so the result from above probably has no effect on the level of expression patterns found before
plot_data = module_size %>% dplyr::select(Module, meanExpr, n)
ggplotly(plot_data %>% ggplot(aes(meanExpr, n)) + geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module))) +
geom_smooth(color='gray', alpha=0.1) + theme_minimal())
Assigning genes by MM balances more the size of the modules, increasing the smallest ones and reducing the largest ones. Mean expression doesn’t seem to be a important factor here.
plot_data = module_size %>% dplyr::select(Module, meanExpr, n) %>% rename(AssignedModuleN=n) %>%
left_join(MM_module_size, by='Module') %>% rename(MMModuleN=n)
ggplotly(plot_data %>% ggplot(aes(AssignedModuleN, MMModuleN, size=meanExpr.x)) + geom_abline(slope=1, intercept=0, color='gray') +
geom_smooth(color='gray', se=FALSE) + geom_point(color=gsub('#gray','gray',paste0('#',plot_data$Module)), alpha=0.5, aes(id=Module)) +
ggtitle('Module Size') + xlab('Size of Assigned Module') + ylab('Size of Module with Higest MM') + theme_minimal())
The module assigned to each gene and the module with the highest Module Membership don’t need to be the same and it’s fine, it’s because they are done with different methods
The biggest modules have the strongest MM values (both positive and negative)
There seems to be some relation between the mean level of expression of a gene and MM (this shouldn’t happen…)
If we use maximum MM as a criteria for assigning modules, the resulting modules have more balanced sizes (I still need to see how to study what happens with the mean level of expression bias)
sessionInfo()
## R version 3.6.0 (2019-04-26)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.6 (Nitrogen)
##
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
##
## locale:
## [1] LC_CTYPE=en_GB.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_GB.UTF-8 LC_COLLATE=en_GB.UTF-8
## [5] LC_MONETARY=en_GB.UTF-8 LC_MESSAGES=en_GB.UTF-8
## [7] LC_PAPER=en_GB.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] knitr_1.24 polycor_0.7-10 expss_0.10.1
## [4] WGCNA_1.68 fastcluster_1.1.25 dynamicTreeCut_1.63-1
## [7] gplots_3.0.1.1 GGally_1.4.0 gridExtra_2.3
## [10] viridis_0.5.1 viridisLite_0.3.0 RColorBrewer_1.1-2
## [13] dendextend_1.13.2 plotly_4.9.1 glue_1.3.1
## [16] reshape2_1.4.3 forcats_0.4.0 stringr_1.4.0
## [19] dplyr_0.8.3 purrr_0.3.3 readr_1.3.1
## [22] tidyr_1.0.0 tibble_2.1.3 ggplot2_3.2.1
## [25] tidyverse_1.3.0
##
## loaded via a namespace (and not attached):
## [1] readxl_1.3.1 backports_1.1.5
## [3] Hmisc_4.2-0 plyr_1.8.5
## [5] lazyeval_0.2.2 splines_3.6.0
## [7] crosstalk_1.0.0 BiocParallel_1.20.1
## [9] GenomeInfoDb_1.22.0 robust_0.4-18.2
## [11] digest_0.6.23 foreach_1.4.7
## [13] htmltools_0.4.0 GO.db_3.10.0
## [15] gdata_2.18.0 fansi_0.4.1
## [17] magrittr_1.5 checkmate_1.9.4
## [19] memoise_1.1.0 fit.models_0.5-14
## [21] cluster_2.0.8 doParallel_1.0.15
## [23] annotate_1.64.0 modelr_0.1.5
## [25] matrixStats_0.55.0 colorspace_1.4-1
## [27] blob_1.2.0 rvest_0.3.5
## [29] rrcov_1.4-7 haven_2.2.0
## [31] xfun_0.8 crayon_1.3.4
## [33] RCurl_1.95-4.12 jsonlite_1.6
## [35] genefilter_1.68.0 zeallot_0.1.0
## [37] impute_1.60.0 survival_2.44-1.1
## [39] iterators_1.0.12 gtable_0.3.0
## [41] zlibbioc_1.32.0 XVector_0.26.0
## [43] DelayedArray_0.12.2 BiocGenerics_0.32.0
## [45] DEoptimR_1.0-8 scales_1.1.0
## [47] mvtnorm_1.0-11 DBI_1.1.0
## [49] Rcpp_1.0.3 xtable_1.8-4
## [51] htmlTable_1.13.1 foreign_0.8-71
## [53] bit_1.1-15.1 preprocessCore_1.48.0
## [55] Formula_1.2-3 stats4_3.6.0
## [57] htmlwidgets_1.5.1 httr_1.4.1
## [59] ellipsis_0.3.0 acepack_1.4.1
## [61] farver_2.0.3 XML_3.98-1.20
## [63] pkgconfig_2.0.3 reshape_0.8.8
## [65] nnet_7.3-12 dbplyr_1.4.2
## [67] locfit_1.5-9.1 later_1.0.0
## [69] labeling_0.3 tidyselect_0.2.5
## [71] rlang_0.4.2 AnnotationDbi_1.48.0
## [73] munsell_0.5.0 cellranger_1.1.0
## [75] tools_3.6.0 cli_2.0.1
## [77] generics_0.0.2 RSQLite_2.2.0
## [79] broom_0.5.3 fastmap_1.0.1
## [81] evaluate_0.14 yaml_2.2.0
## [83] bit64_0.9-7 fs_1.3.1
## [85] robustbase_0.93-5 caTools_1.17.1.2
## [87] nlme_3.1-139 mime_0.8
## [89] xml2_1.2.2 compiler_3.6.0
## [91] rstudioapi_0.10 reprex_0.3.0
## [93] geneplotter_1.64.0 pcaPP_1.9-73
## [95] stringi_1.4.5 lattice_0.20-38
## [97] Matrix_1.2-17 vctrs_0.2.1
## [99] pillar_1.4.3 lifecycle_0.1.0
## [101] data.table_1.12.8 bitops_1.0-6
## [103] httpuv_1.5.2 GenomicRanges_1.38.0
## [105] R6_2.4.1 latticeExtra_0.6-28
## [107] promises_1.1.0 KernSmooth_2.23-15
## [109] IRanges_2.20.2 codetools_0.2-16
## [111] MASS_7.3-51.4 gtools_3.8.1
## [113] assertthat_0.2.1 SummarizedExperiment_1.16.1
## [115] DESeq2_1.26.0 withr_2.1.2
## [117] S4Vectors_0.24.2 GenomeInfoDbData_1.2.2
## [119] mgcv_1.8-28 parallel_3.6.0
## [121] hms_0.5.3 grid_3.6.0
## [123] rpart_4.1-15 rmarkdown_1.14
## [125] Cairo_1.5-10 shiny_1.4.0
## [127] Biobase_2.46.0 lubridate_1.7.4
## [129] base64enc_0.1-3